+++ /dev/null
-From: Carbo Kuo <byvoid@byvoid.com>
-Date: Thu, 25 Feb 2021 21:13:38 +0900
-Subject: Fix a severe performance bug in `Conversion::Convert` that caused
- O(N^2) complexity.
-
-In `Conversion.cpp`, line 27:
-```
- Optional<const DictEntry*> matched = dict->MatchPrefix(pstr);
-```
-pstr is a `const char*`. However, there is no overloaded function which parameter `const char*`.
-Therefore it matches `Optional<const DictEntry*> MatchPrefix(const std::string& word) const`.
-There is an implicit type conversion from `char*` to `std::string` with time complexity O(N).
-
-I added new benchmark tests. Before the fix:
-
-1: ------------------------------------------------------------------
-1: Benchmark Time CPU Iterations
-1: ------------------------------------------------------------------
-1: BM_Initialization/hk2s 1.17 ms 1.12 ms 645
-1: BM_Initialization/hk2t 0.116 ms 0.116 ms 5922
-1: BM_Initialization/jp2t 0.206 ms 0.201 ms 3500
-1: BM_Initialization/s2hk 18.2 ms 17.9 ms 40
-1: BM_Initialization/s2t 18.2 ms 18.1 ms 39
-1: BM_Initialization/s2tw 17.9 ms 17.8 ms 39
-1: BM_Initialization/s2twp 18.6 ms 18.4 ms 39
-1: BM_Initialization/t2hk 0.055 ms 0.054 ms 12907
-1: BM_Initialization/t2jp 0.120 ms 0.117 ms 5978
-1: BM_Initialization/t2s 0.988 ms 0.984 ms 710
-1: BM_Initialization/tw2s 1.08 ms 1.05 ms 672
-1: BM_Initialization/tw2sp 1.26 ms 1.24 ms 563
-1: BM_Initialization/tw2t 0.088 ms 0.083 ms 8528
-1: BM_Convert2M 413 ms 413 ms 2
-1: BM_Convert/100 1.09 ms 1.09 ms 629
-1: BM_Convert/1000 33.2 ms 33.2 ms 21
-1: BM_Convert/10000 2822 ms 2822 ms 1
-1: BM_Convert/100000 (took longer than 5 minutes, killed)
-
-Now:
-1: ------------------------------------------------------------------
-1: Benchmark Time CPU Iterations
-1: ------------------------------------------------------------------
-1: BM_Initialization/hk2s 1.07 ms 1.07 ms 650
-1: BM_Initialization/hk2t 0.114 ms 0.114 ms 6092
-1: BM_Initialization/jp2t 0.204 ms 0.200 ms 3503
-1: BM_Initialization/s2hk 18.2 ms 18.0 ms 40
-1: BM_Initialization/s2t 17.6 ms 17.6 ms 39
-1: BM_Initialization/s2tw 18.0 ms 17.9 ms 40
-1: BM_Initialization/s2twp 17.9 ms 17.9 ms 39
-1: BM_Initialization/t2hk 0.055 ms 0.055 ms 12855
-1: BM_Initialization/t2jp 0.125 ms 0.124 ms 5772
-1: BM_Initialization/t2s 1.000 ms 0.989 ms 695
-1: BM_Initialization/tw2s 1.09 ms 1.07 ms 668
-1: BM_Initialization/tw2sp 1.29 ms 1.26 ms 558
-1: BM_Initialization/tw2t 0.082 ms 0.082 ms 8558
-1: BM_Convert2M 361 ms 361 ms 2
-1: BM_Convert/100 0.741 ms 0.740 ms 948
-1: BM_Convert/1000 7.54 ms 7.52 ms 94
-1: BM_Convert/10000 76.3 ms 76.3 ms 9
-1: BM_Convert/100000 786 ms 786 ms 1
-
-This bug was reported in https://github.com/BYVoid/OpenCC/issues/478 and https://github.com/BYVoid/OpenCC/issues/517.
-
-Applied-Upstream: https://github.com/BYVoid/OpenCC/commit/c2e548e5e95c9a8ccc5c1e5feb259e8885ef32c6
----
- src/Dict.hpp | 7 ++++
- src/benchmark/Performance.cpp | 77 ++++++++++++++++++++++++++++++++++---------
- 2 files changed, 69 insertions(+), 15 deletions(-)
-
-diff --git a/src/Dict.hpp b/src/Dict.hpp
-index 461d6d2..1c81034 100644
---- a/src/Dict.hpp
-+++ b/src/Dict.hpp
-@@ -49,6 +49,13 @@ public:
- virtual Optional<const DictEntry*> MatchPrefix(const char* word,
- size_t len) const;
-
-+ /**
-+ * Matches the longest matched prefix of a word.
-+ */
-+ Optional<const DictEntry*> MatchPrefix(const char* word) const {
-+ return MatchPrefix(word, KeyMaxLength());
-+ }
-+
- /**
- * Matches the longest matched prefix of a word.
- */
-diff --git a/src/benchmark/Performance.cpp b/src/benchmark/Performance.cpp
-index cf8d3aa..d1b6468 100644
---- a/src/benchmark/Performance.cpp
-+++ b/src/benchmark/Performance.cpp
-@@ -1,7 +1,26 @@
-+/*
-+ * Open Chinese Convert
-+ *
-+ * Copyright 2020-2021 Carbo Kuo <byvoid@byvoid.com>
-+ *
-+ * Licensed under the Apache License, Version 2.0 (the "License");
-+ * you may not use this file except in compliance with the License.
-+ * You may obtain a copy of the License at
-+ *
-+ * http://www.apache.org/licenses/LICENSE-2.0
-+ *
-+ * Unless required by applicable law or agreed to in writing, software
-+ * distributed under the License is distributed on an "AS IS" BASIS,
-+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and
-+ * limitations under the License.
-+ */
-+
- #include <benchmark/benchmark.h>
- #include <fstream>
- #include <iostream>
- #include <memory>
-+#include <sstream>
- #include <streambuf>
-
- #ifdef _MSC_VER
-@@ -44,21 +63,32 @@ static void BM_Initialization(benchmark::State& state,
- state.ResumeTiming();
- }
- }
--BENCHMARK_CAPTURE(BM_Initialization, hk2s, "hk2s");
--BENCHMARK_CAPTURE(BM_Initialization, hk2t, "hk2t");
--BENCHMARK_CAPTURE(BM_Initialization, jp2t, "jp2t");
--BENCHMARK_CAPTURE(BM_Initialization, s2hk, "s2hk");
--BENCHMARK_CAPTURE(BM_Initialization, s2t, "s2t");
--BENCHMARK_CAPTURE(BM_Initialization, s2tw, "s2tw");
--BENCHMARK_CAPTURE(BM_Initialization, s2twp, "s2twp");
--BENCHMARK_CAPTURE(BM_Initialization, t2hk, "t2hk");
--BENCHMARK_CAPTURE(BM_Initialization, t2jp, "t2jp");
--BENCHMARK_CAPTURE(BM_Initialization, t2s, "t2s");
--BENCHMARK_CAPTURE(BM_Initialization, tw2s, "tw2s");
--BENCHMARK_CAPTURE(BM_Initialization, tw2sp, "tw2sp");
--BENCHMARK_CAPTURE(BM_Initialization, tw2t, "tw2t");
-+BENCHMARK_CAPTURE(BM_Initialization, hk2s, "hk2s")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, hk2t, "hk2t")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, jp2t, "jp2t")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, s2hk, "s2hk")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, s2t, "s2t")->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, s2tw, "s2tw")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, s2twp, "s2twp")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, t2hk, "t2hk")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, t2jp, "t2jp")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, t2s, "t2s")->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, tw2s, "tw2s")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, tw2sp, "tw2sp")
-+ ->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Initialization, tw2t, "tw2t")
-+ ->Unit(benchmark::kMillisecond);
-
--static void BM_Convert(benchmark::State& state) {
-+static void BM_Convert2M(benchmark::State& state) {
- const std::string config_name = "s2t";
- const std::string text = ReadText("zuozhuan.txt");
- const std::unique_ptr<SimpleConverter> converter(Initialize(config_name));
-@@ -66,7 +96,24 @@ static void BM_Convert(benchmark::State& state) {
- Convert(converter.get(), text);
- }
- }
--BENCHMARK(BM_Convert)->Unit(benchmark::kMillisecond);
-+BENCHMARK(BM_Convert2M)->Unit(benchmark::kMillisecond);
-+
-+static void BM_Convert(benchmark::State& state, int iteration) {
-+ std::ostringstream os;
-+ for (int i = 0; i < iteration; i++) {
-+ os << "Open Chinese Convert 開放中文轉換" << i << std::endl;
-+ }
-+ const std::string text = os.str();
-+ const std::string config_name = "s2t";
-+ const std::unique_ptr<SimpleConverter> converter(Initialize(config_name));
-+ for (auto _ : state) {
-+ Convert(converter.get(), text);
-+ }
-+}
-+BENCHMARK_CAPTURE(BM_Convert, 100, 100)->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Convert, 1000, 1000)->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Convert, 10000, 10000)->Unit(benchmark::kMillisecond);
-+BENCHMARK_CAPTURE(BM_Convert, 100000, 100000)->Unit(benchmark::kMillisecond);
-
- } // namespace opencc
-